{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3ca8ca7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import paddle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "51ea4157",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\ANACONDA\\lib\\site-packages\\ipykernel\\ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
      "  and should_run_async(code)\n"
     ]
    }
   ],
   "source": [
    "data = pd.read_csv(\"all_movies_with_id.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5c51448a",
   "metadata": {},
   "outputs": [],
   "source": [
    "comment = data[\"Comment\"][:1000]\n",
    "star = data[\"Star\"][:1000]\n",
    "data=None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "0817298a",
   "metadata": {},
   "outputs": [],
   "source": [
    "label = []\n",
    "for i in star:\n",
    "    if i>=4:\n",
    "        label.append(2)\n",
    "    elif i<=2:\n",
    "        label.append(0)\n",
    "    else:\n",
    "        label.append(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c8a15c6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import jieba\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3c41e05",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "cf2e4d33",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\zhang\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 0.671 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "cnt = Counter()\n",
    "for sentence in comment:\n",
    "    words = jieba.cut(sentence,cut_all=True)\n",
    "    cnt+=Counter(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "4533e54e",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "MAX_LEN=30"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "7258fe55",
   "metadata": {},
   "outputs": [],
   "source": [
    "count = 1\n",
    "word_to_id_dict = {}\n",
    "for i in cnt:\n",
    "    word_to_id_dict[i]=count\n",
    "    count+=1\n",
    "word_to_id_dict[\"unk\"] = count\n",
    "word_to_id_dict[\"<pad>\"] = 0\n",
    "\n",
    "id_to_word_dict ={j:i for i,j in zip(word_to_id_dict.keys(),word_to_id_dict.values())}\n",
    "word_size = len(id_to_word_dict)\n",
    "\n",
    "dict_size = len(id_to_word_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "efeac5d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_data(word_to_id_dict,comment,label):\n",
    "    train_inputs =[]\n",
    "    test_inputs=[]\n",
    "    for i,k in zip(comment[:-100],label[:-100]):\n",
    "        temp = []\n",
    "        for j in jieba.cut(i):\n",
    "            temp.append(word_to_id_dict.get(j,word_to_id_dict[\"unk\"]))\n",
    "        if len(temp)>MAX_LEN:\n",
    "            temp = temp[:MAX_LEN]\n",
    "        elif len(temp)<MAX_LEN:\n",
    "            temp = temp+[0]*(MAX_LEN-len(temp))\n",
    "        train_inputs.append((temp,k))\n",
    "    for i,k in zip(comment[-100:],label[-100:]):\n",
    "        temp = []\n",
    "        for j in jieba.cut(i):\n",
    "            temp.append(word_to_id_dict.get(j,word_to_id_dict[\"unk\"]))\n",
    "        if len(temp)>MAX_LEN:\n",
    "            temp = temp[:MAX_LEN]\n",
    "        elif len(temp)<MAX_LEN:\n",
    "            temp = temp+[0]*(MAX_LEN-len(temp))\n",
    "        test_inputs.append((temp,k))\n",
    "    \n",
    "    return train_inputs,test_inputs\n",
    "\n",
    "\n",
    "train_data,test_data = get_data(word_to_id_dict,comment,label)\n",
    "\n",
    "    \n",
    "        \n",
    "            \n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "2ac2ccd1",
   "metadata": {},
   "outputs": [],
   "source": [
    "class MyTextDataset(paddle.io.Dataset):\n",
    "    def __init__(self,data):\n",
    "        super().__init__()\n",
    "        self.data = data\n",
    "        self.length = len(data)\n",
    "    def __getitem__(self,index):\n",
    "        input_x = self.data[index][0]\n",
    "        label = self.data[index][1]\n",
    "        return np.array(input_x),label\n",
    "#         return input_x,label\n",
    "    def __len__(self):\n",
    "        return self.length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d6f3de27",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c10d5ea",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "6d349e1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "mydata_train = MyTextDataset(train_data)\n",
    "mydata_test = MyTextDataset(test_data)\n",
    "\n",
    "\n",
    "train_dataload = paddle.io.DataLoader(mydata_train,batch_size=64,shuffle=True,drop_last=False)\n",
    "test_dataload =  paddle.io.DataLoader(mydata_test,batch_size=64,shuffle=True,drop_last=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c012b178",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "1dffb4af",
   "metadata": {},
   "outputs": [],
   "source": [
    "class MyModel(paddle.nn.Layer):\n",
    "    def __init__(self,dict_size):\n",
    "        super().__init__()\n",
    "        \n",
    "        self.embedding = paddle.nn.Embedding(dict_size,128)\n",
    "        self.rnn = paddle.nn.LSTM(input_size=128,hidden_size=128,num_layers=1,dropout=0.2)\n",
    "        self.linear = paddle.nn.Linear(128,3)\n",
    "    def forward(self,x):\n",
    "        \n",
    "        x= self.embedding(x)\n",
    "        out,_ = self.rnn(x)\n",
    "        out = paddle.mean(out,1)\n",
    "        out = self.linear(out)\n",
    "        return out\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "926c95f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = MyModel(dict_size)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "b46fcbad",
   "metadata": {},
   "outputs": [],
   "source": [
    "opt = paddle.optimizer.Adam(learning_rate=0.001,parameters=model.parameters())\n",
    "loss_fn = paddle.nn.CrossEntropyLoss()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e2366d3",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "997c169f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8230083584785461\n",
      "0.7659845352172852\n",
      "0.7596207857131958\n",
      "0.7198231220245361\n",
      "0.7326130867004395\n",
      "0.7270877361297607\n",
      "0.7580116391181946\n",
      "0.7230734825134277\n",
      "0.8267865180969238\n",
      "0.7633288502693176\n",
      "0.6290515661239624\n",
      "0.75119948387146\n",
      "0.6527823209762573\n",
      "0.6014413237571716\n",
      "0.7892528176307678\n"
     ]
    }
   ],
   "source": [
    "for input_x,label in train_dataload:\n",
    "    \n",
    "    label = label.astype(paddle.int64)\n",
    "    out = model(input_x)\n",
    "    \n",
    "    loss = loss_fn(out,label)\n",
    "    print(loss.item())\n",
    "    loss.backward()\n",
    "    opt.step()\n",
    "    opt.clear_grad()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "4ca6b6a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def precess(txt,word_to_id_dict):\n",
    "    temp = []\n",
    "    for i in jieba.cut(txt):\n",
    "        temp.append(word_to_id_dict.get(i,word_to_id_dict[\"unk\"]))\n",
    "    \n",
    "    \n",
    "    inputs = paddle.to_tensor([temp])\n",
    "    return inputs\n",
    "\n",
    "inputs = \"我非常的喜欢中国，我太爱中国了，中国是我的家\"\n",
    "\n",
    "out = model(precess(inputs,word_to_id_dict))\n",
    " \n",
    "\n",
    "\n",
    "        \n",
    "    \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "239d2693",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "中立\n"
     ]
    }
   ],
   "source": [
    "pre = paddle.argmax(out,-1).item()\n",
    "if pre==0:\n",
    "    print(\"消极\")\n",
    "elif pre==1:\n",
    "    print(\"中立\")\n",
    "else:\n",
    "    print(\"积极\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53db3779",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
